library(tidyverse)
library(lubridate)
library(DT)
library(viridis)
library(janitor)
library(plotly)
library(respirometry)
library(visNetwork)Lab 13
NEON MAG Community Composition, Diversity, and Quality Across Sites
This tutorial examines NEON MAGs across sites to summarize MAG richness and its relation to soil pH, taxonomic profiles by site, MAG quality (completeness and contamination), and the proportion of high quality MAGs per sample and site.
Step 1: Load and Prepare Data
df <- read_csv("NEON_soilMAGs_soilChem.csv")
head(df) # A tibble: 6 × 43
...1 genomicsSampleID soilMoisture decimalLatitude decimalLongitude elevation
<dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 BONA_001-O-2023… 1.55 65.2 -147. 374.
2 2 BONA_001-O-2023… 1.55 65.2 -147. 374.
3 3 BONA_001-O-2023… 1.55 65.2 -147. 374.
4 4 BONA_001-O-2023… 1.55 65.2 -147. 374.
5 5 BONA_001-O-2023… 1.55 65.2 -147. 374.
6 6 BONA_001-O-2023… 1.55 65.2 -147. 374.
# ℹ 37 more variables: soilTemp <dbl>, soilInWaterpH <dbl>, soilInCaClpH <dbl>,
# bin_oid <chr>, bin_id <chr>, site <chr>, site_ID <chr>, subplot <chr>,
# layer <chr>, quadrant <lgl>, date <dbl>, img_genome_id <dbl>,
# bin_quality <chr>, bin_lineage <chr>, gtdb_taxonomy_lineage <chr>,
# domain <chr>, phylum <chr>, class <chr>, order <chr>, family <chr>,
# genus <chr>, bin_methods <chr>, created_by <chr>, date_added <date>,
# bin_completeness <dbl>, bin_contamination <dbl>, average_coverage <lgl>, …
Step 2: MAG Richness per Site and Relation to Soil pH
Explanation: Count MAGs per sample and visualize whether samples with different pH have different numbers of recovered MAGs.
richness <- df |>
group_by(genomicsSampleID) |>
# I am doing mean, just so I can get one value per sample ID, since the pH is the same for all of it anyways.
summarise(n_mags = n_distinct(bin_id),
soilInWaterpH = mean(soilInWaterpH, na.rm = TRUE),
.groups = "drop"
) |>
mutate(pH_group = case_when(!is.na(soilInWaterpH) & soilInWaterpH < 5.5 ~ "acidic",
!is.na(soilInWaterpH) ~ "neutral"))
head(richness)# A tibble: 6 × 4
genomicsSampleID n_mags soilInWaterpH pH_group
<chr> <int> <dbl> <chr>
1 BONA_001-O-20230710 32 3.99 acidic
2 BONA_002-O-20230711 31 4.35 acidic
3 BONA_004-O-20230712 40 3.69 acidic
4 BONA_006-O-20230710 29 4.05 acidic
5 BONA_009-O-20230711 16 4.08 acidic
6 BONA_013-O-20230710 31 3.97 acidic
richness |>
ggplot(aes(x = pH_group, y = n_mags)) +
geom_boxplot() + geom_jitter() +
labs(
x = "pH Group (<5.5 = acidic)",
y = "MAG Richness (Distinct bin_id)",
title = "MAG Richness per Sample and Relation to Soil pH"
)# Correlation test,
cor_test <- cor.test(richness$n_mags, richness$soilInWaterpH, use="complete.obs")
print(cor_test)
Pearson's product-moment correlation
data: richness$n_mags and richness$soilInWaterpH
t = -4.8204, df = 291, p-value = 2.311e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3748380 -0.1624033
sample estimates:
cor
-0.2719304
The correlation test shows that MAG richness is negatively correlated with sample pH (r = -0.2719, 95% CI = -0.3748 to -0.1624, n = 293, p = 2.31e-06).
Step 3: Taxonomic Profiling by Site
Explanation: Summarize phylum-level composition per site and plot stacked bars.
tax_profile <- df |>
group_by(site_ID, phylum) |>
summarise(count = n(), .groups = "drop") |>
mutate(rel_abundance = count / sum(count))
ggplot(tax_profile, aes(x = site_ID, y = rel_abundance, fill = phylum)) +
geom_bar(stat = "identity") +
scale_fill_viridis_d(option = "turbo") +
theme_minimal() +
labs(title = "Taxonomic Profile by Site", y = "Relative Abundance") +
theme(axis.text.x = element_text(angle = 60, size = 6, vjust = 1.2, hjust = 1), legend.position = "right")Step 4: MAG Quality vs Soil Chemistry
Explanation: Visualize whether MAG quality/assembly metrics differ across soil gradients.
mag_meta <- df |>
select(bin_id, genomicsSampleID, bin_completeness, bin_contamination, gene_count, scaffold_count, site_ID)
sample_env <- df |>
distinct(genomicsSampleID, .keep_all = TRUE) |>
select(genomicsSampleID, soilInWaterpH, soilMoisture, soilTemp, decimalLatitude, decimalLongitude, site_ID)
mag_env <- mag_meta |>
left_join(sample_env, by = "genomicsSampleID")
head(mag_env)# A tibble: 6 × 13
bin_id genomicsSampleID bin_completeness bin_contamination gene_count
<chr> <chr> <dbl> <dbl> <dbl>
1 3300079481_s0 BONA_001-O-2023… 95.6 4.96 5785
2 3300079481_s1 BONA_001-O-2023… 94.7 1.99 3054
3 3300079481_s101 BONA_001-O-2023… 51.8 9.42 3039
4 3300079481_s11 BONA_001-O-2023… 60.8 0.04 2539
5 3300079481_s15 BONA_001-O-2023… 81.5 0.18 3314
6 3300079481_s2 BONA_001-O-2023… 88.1 0.19 3753
# ℹ 8 more variables: scaffold_count <dbl>, site_ID.x <chr>,
# soilInWaterpH <dbl>, soilMoisture <dbl>, soilTemp <dbl>,
# decimalLatitude <dbl>, decimalLongitude <dbl>, site_ID.y <chr>
# Distribution graphs
p1 <- ggplot(mag_env, aes(x = bin_completeness)) +
geom_histogram(bins = 40, fill = "cornflowerblue", color = "white") +
theme_minimal() + labs(title = "Distribution of Bin Completeness",
x = "Bin Completeness",
y = "Count")
p2 <- ggplot(mag_env, aes(x = bin_contamination)) +
geom_histogram(bins = 40, fill = "tomato", color = "white") +
theme_minimal() + labs(title = "Distribution of Bin Contamination",
x = "Bin Contamination",
y = "Count")
p3 <- ggplot(mag_env, aes(x = scaffold_count)) +
geom_histogram(bins = 40, fill = "darkgreen", color = "white") +
theme_minimal() + labs(title = "Distribution of Scaffold Count",
x = "Scaffold Count",
y = "Count")
p4 <- ggplot(mag_env, aes(x = gene_count)) +
geom_histogram(bins = 40, fill = "purple", color = "white") +
theme_minimal() + labs(title = "Distribution of Gene Count",
x = "Gene Count",
y = "Count")
print(p1); print(p2); print(p3); print(p4)Bin Completeness vs Soil pH
library(hexbin)
# Scatter Bin Completeness vs pH with linear fit
# I used hexbin because it was very hard to read with so many bin ID data points
p_comp_pH <- ggplot(mag_env, aes(x = soilInWaterpH, y = bin_completeness)) +
stat_binhex(bins = 20) +
scale_fill_viridis_c(option = "plasma", breaks = c(1, 5, 10, 20, 30, 40, 50),
labels = c("1", "5", "10", "20", "30", "40", "50")) +
geom_smooth(method = "lm", se = TRUE, color = "white") +
geom_point(alpha = 0) +
theme_minimal() +
labs(x = "pH",
y = "Bin Completeness",
title = "MAG Completeness vs Sample pH")
print(p_comp_pH)# Correlation test for p_comp_pH
df2 <- mag_env %>%
filter(!is.na(bin_completeness), !is.na(soilInWaterpH))
nrow(df2) # sample size used[1] 5534
pearson_res <- cor.test(df2$bin_completeness, df2$soilInWaterpH, method = "pearson")
print(pearson_res)
Pearson's product-moment correlation
data: df2$bin_completeness and df2$soilInWaterpH
t = -13.118, df = 5532, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1991263 -0.1480192
sample estimates:
cor
-0.1736897
# Correlation test results show a weak correlation between bin completeness and soil pH.Bin Contamination vs Soil pH
p_cont_pH <- ggplot(mag_env, aes(x = soilInWaterpH, y = bin_contamination)) +
geom_point(alpha = 0.2, size = 1) +
stat_binhex(bins = 20) +
scale_fill_viridis_c(option = "plasma", breaks = c(5, 10, 20, 30, 40, 50),
labels = c("5", "10", "20", "30", "40", "50")) +
geom_smooth(method = "lm", se = TRUE, color = "white") +
theme_minimal() +
labs(
x = "pH",
y = "Bin Contamination",
title = "MAG Contamination vs Sample pH"
)
print(p_cont_pH)Step 5: Proportion of High Quality MAGs per Sample/Site
Explanation: Computing how many high quality MAGs come from each sample or site.
Creating the Tables
# This is to create the datasets for site and sample
num_of_HQ <- df |>
mutate(is_HQ = as.integer(bin_quality == "HQ"))
sample_summary <- num_of_HQ |>
group_by(genomicsSampleID, site_ID) |>
summarise(
n_MAGs = n_distinct(bin_id),
n_HQ = sum(is_HQ, na.rm = TRUE),
prop_HQ = n_HQ / pmax(n_MAGs, 1),
med_comp = median(bin_completeness, na.rm = TRUE),
med_cont = median(bin_contamination, na.rm = TRUE),
.groups = "drop"
)
site_summary <- sample_summary %>%
group_by(site_ID) %>%
summarise(
n_samples = n(),
total_MAGs = sum(n_MAGs, na.rm = TRUE),
total_HQ = sum(n_HQ, na.rm = TRUE),
prop_HQ = total_HQ / pmax(total_MAGs, 1),
.groups = "drop"
)
head(site_summary)# A tibble: 6 × 5
site_ID n_samples total_MAGs total_HQ prop_HQ
<chr> <int> <int> <int> <dbl>
1 BONA 10 256 18 0.0703
2 CLBJ 9 276 21 0.0761
3 CPER 9 122 2 0.0164
4 DCFS 12 258 14 0.0543
5 DEJU 17 474 29 0.0612
6 DSNY 6 116 2 0.0172
head(sample_summary)# A tibble: 6 × 7
genomicsSampleID site_ID n_MAGs n_HQ prop_HQ med_comp med_cont
<chr> <chr> <int> <int> <dbl> <dbl> <dbl>
1 BONA_001-O-20230710 BONA 32 1 0.0312 66.2 1.9
2 BONA_002-O-20230711 BONA 31 2 0.0645 76.9 2.14
3 BONA_004-O-20230712 BONA 40 5 0.125 76.5 1.66
4 BONA_006-O-20230710 BONA 29 5 0.172 81.0 1.13
5 BONA_009-O-20230711 BONA 16 2 0.125 73.7 1.92
6 BONA_013-O-20230710 BONA 31 0 0 64.0 2.02
Proportion of High Quality MAGs per Sample
Explanation: Create an interactive graph to better visualize data, and be able to sort each sample per site because there are so many different samples.
plot_ly(
data = sample_summary,
x = ~n_MAGs,
y = ~prop_HQ,
color = ~site_ID, # one trace per site (automatic grouping)
size = ~n_MAGs, # size indicates reliability
sizes = c(6, 30), # size range
text = ~paste0(
"sample: ", genomicsSampleID,
"<br>site: ", site_ID,
"<br>n_MAGs: ", n_MAGs,
"<br>prop_HQ: ", sprintf("%.3f", prop_HQ)
),
hoverinfo = "text",
type = "scatter",
mode = "markers"
) |>
layout(
title = "Proportion of High Quality vs Number of MAGs (Click Legend to Highlight a Site)",
xaxis = list(title = "Number of MAGs in Sample"),
yaxis = list(title = "Proportion HQ", tickformat = ".0%"),
legend = list(itemclick = "toggleothers") # click a legend item to show that site only
)Proportion of High Quality MAGs per Site
ggplot(site_summary, aes(x = site_ID, y = prop_HQ)) +
geom_col(fill = "steelblue2") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 60, vjust = 1.3, hjust = 1.2)) +
labs(
x = "Site ID",
y = "Proportion of HQ MAGs",
title = "Proportion of High Quality MAGs at Each Site"
)